home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Skunkware 5
/
Skunkware 5.iso
/
src
/
Tools
/
glimpse-2.1
/
index
/
getword.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-05-16
|
6KB
|
201 lines
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
/* ./glimpse/index/getword.c */
#include "glimpse.h"
/*
 * File-offset bookkeeping, defined outside the visible part of this file.
 * getword() copies NextICurrentFileOffset into ICurrentFileOffset on entry
 * and advances NextICurrentFileOffset before its normal return.
 */
extern int NextICurrentFileOffset, ICurrentFileOffset;
/* When nonzero, getword() calls region_identify() to fill in *pattr for each word it finds. */
int StructuredIndex = 0;
/* Set to ON by getword() while it discards an over-long word; always reset to 0 before getword() returns normally. */
int WORD_TOO_LONG = 0;
/* When nonzero, digits are indexable too (set_indexable_char() uses isalnum() instead of isalpha()). */
int IndexNumber = 0;
/* When nonzero, getword() keeps the original case, rejects mixed-case words and does not interpret special characters. */
int CountWords = 0;
/* When nonzero, getword() treats a backslash as an escape: it skips the backslash and the following character and ends the word. */
int InterpretSpecial = 0;
/* Per-character lookup table consulted through INDEXABLE(); set_indexable_char() fills a table of this shape. */
int indexable_char[256];
/* Longest word getword() will return; initialised from MAX_WORD_SIZE (presumably supplied by glimpse.h). */
int GMAX_WORD_SIZE = MAX_WORD_SIZE;
/* Capitalization states used by getword() when CountWords is set. */
#define ALL_LOWER 0 /* default, what you start with: all are possible */
#define FIRST_UPPER 1 /* only first one seen is upper: 0 is impossible */
#define ALL_UPPER 2 /* all seen so far are upper: 2 and 3 are possible */
#define MIXED 3 /* neither of the above 3 */
/* Word-class constants; not referenced in the visible part of this file. */
#define ALPHANUM 1
#define ALPHAONLY 2
#define NUMONLY 3
/* Nonzero if character c may be part of a word; c is used directly as an index, so it must be in 0..255. */
#define INDEXABLE(c) (indexable_char[c])
/* -------------------------------------------------------------------------
getword():
Extract the next word from the text in [buffer, buffer_end) into `word`.
A word is a run of characters for which INDEXABLE() is nonzero.

Parameters:
  word       - output buffer; receives the NUL-terminated word (possibly
               empty). At most GMAX_WORD_SIZE + 1 characters are stored
               before the terminator is written over the first or last one.
  buffer     - current scan position.
  buffer_end - one past the last readable character; nothing at or beyond
               this address is read.
  pattr      - if non-NULL, set to 0 on entry and, when StructuredIndex is
               set and a word was found, to region_identify(ICurrentFileOffset, 0).

Returns the position from which the next call should continue scanning.
If the first character is not indexable, no word is produced and only the
pointer is advanced past the non-indexable run.

Behaviour depends on the global CountWords:
  - nonzero: case is preserved; a word whose capitalization is neither all
    lower, first-letter-upper, nor all upper is discarded (word is set to
    "" and the rest of that word is skipped).
  - zero: upper case is folded to lower case; '-' outside [...] ends the
    word; '^' directly after '[' is kept when '[' is indexable; and, if
    InterpretSpecial is set, a backslash skips itself and the next character.

A word longer than GMAX_WORD_SIZE is dropped (word is returned empty) and
the remainder of it is skipped.

Side effects: updates ICurrentFileOffset, NextICurrentFileOffset (not on the
mixed-case early return) and WORD_TOO_LONG.
--------------------------------------------------------------------------*/
unsigned char *getword(word, buffer, buffer_end, pattr)
unsigned char *word;
unsigned char *buffer;
unsigned char *buffer_end;
int *pattr;
{
    int word_length = 0;
    unsigned char *wp = word;               /* start of the output buffer */
    unsigned char *old_buffer = buffer;     /* scan position on entry */
    int previslsq = 0;                      /* previous character copied was '[' */
    int withinsq = 0;                       /* currently between '[' and ']' */

    ICurrentFileOffset = NextICurrentFileOffset;
    if (pattr != NULL) *pattr = 0;

    if (CountWords) {   /* don't convert case, ignore special, don't bother about offsets. */
        unsigned char *temp_buffer;
        int flag = ALL_LOWER;

        /* Look ahead (without consuming) to classify the word's capitalization. */
        for (temp_buffer = buffer; (temp_buffer - buffer < GMAX_WORD_SIZE) && (temp_buffer < buffer_end); temp_buffer++) {
            if (!INDEXABLE(*temp_buffer)) break;
            if (isupper(*temp_buffer)) {
                if (flag == ALL_LOWER) {
                    if (temp_buffer == buffer) flag = FIRST_UPPER;
                    else { flag = MIXED; break; }
                }
                else if (flag == FIRST_UPPER) {
                    if (temp_buffer == buffer + 1) flag = ALL_UPPER;
                    else { flag = MIXED; break; }
                }
                else continue;  /* must be ALL_UPPER -> let it remain so */
            }
            else if (islower(*temp_buffer)) {
                if (flag == ALL_LOWER) continue;
                else if (flag == FIRST_UPPER) continue;
                else if (flag == ALL_UPPER) { flag = MIXED; break; }
            }
            /* else, not alphabet: ignore */
        }

        if (flag == MIXED) {    /* discard mixed words since they cannot be indexed */
            word[0] = '\0';
            /*
             * Skip the rest of the word plus the one character that ends it,
             * but never read or step beyond buffer_end.
             */
            while (temp_buffer < buffer_end) {
                int c = *temp_buffer++;
                if (!(IndexNumber ? isalnum(c) : isalpha(c))) break;
            }
            return temp_buffer;
        }

        while (buffer < buffer_end) {
            if (INDEXABLE(*buffer)) {
                *word++ = *buffer++;
                word_length++;
            }
            else {
                while ((buffer < buffer_end) && !(INDEXABLE(*buffer))) buffer++;
                break;
            }
            if (word_length > GMAX_WORD_SIZE) {
                word = wp;
                WORD_TOO_LONG = ON;
                /* skip current long word, staying inside the buffer */
                while ((buffer < buffer_end) && INDEXABLE(*buffer)) buffer++;
                break;
            }
        }
    }
    else {  /* convert case, maybe interpret special */
        while (buffer < buffer_end) {
            if (INDEXABLE(*buffer)) {   /* ICurrentFileOffset is in the right place */
                if (*buffer == '[') {
                    previslsq = 1;
                    withinsq = 1;
                }
                else {
                    previslsq = 0;
                    if (*buffer == ']') withinsq = 0;
                }
                if ((*buffer == '-') && !withinsq) {    /* terminate word here */
                    buffer++;
                    ICurrentFileOffset++;
                    break;
                }
                if (isupper(*buffer)) *word++ = tolower(*buffer++);
                else *word++ = *buffer++;
                word_length++;
            }
            else if (INDEXABLE('[') && (*buffer == '^') && previslsq) {
                /* keep the '^' of a "[^" sequence as part of the word */
                *word++ = *buffer++;
                word_length++;
                previslsq = 0;
            }
            else {
                previslsq = 0;
                if (InterpretSpecial && (*buffer == '\\')) {
                    /* skip two things AND terminate word HERE */
                    if (buffer < buffer_end - 1) {
                        buffer += 2;
                        if (word_length <= 0) ICurrentFileOffset += 2;
                    }
                    else if (buffer < buffer_end) {
                        buffer++;
                        if (word_length <= 0) ICurrentFileOffset++;
                    }
                }
                else {
                    /* The offset only follows the skip when no word has been started yet. */
                    if (word_length <= 0) while ((buffer < buffer_end) && !(INDEXABLE(*buffer))) {
                        ICurrentFileOffset++;
                        buffer++;
                    }
                    else while ((buffer < buffer_end) && !(INDEXABLE(*buffer))) buffer++;
                }
                break;
            }
            if (word_length > GMAX_WORD_SIZE) {
                word = wp;
                WORD_TOO_LONG = ON;
                /* skip current long word, staying inside the buffer */
                while ((buffer < buffer_end) && INDEXABLE(*buffer)) buffer++;
                break;
            }
        }
    }

    if (WORD_TOO_LONG) *wp = '\0';
    *word = '\0';
    WORD_TOO_LONG = 0;
    if ((pattr != NULL) && (word_length > 0) && (StructuredIndex))
        *pattr = region_identify(ICurrentFileOffset, 0);
    NextICurrentFileOffset += (buffer <= old_buffer) ? 1 : (buffer - old_buffer);   /* beginning of next word, atleast 1 */
    return(buffer);
}
/*
 * set_indexable_char(): fill a 256-entry table with nonzero for every
 * character that may appear in an indexed word and 0 for all others.
 *
 * Characters above 0x7f are never indexable. Within the 7-bit range a
 * character is indexable if it is a letter, or also a digit when the global
 * IndexNumber is nonzero. '_' is always indexable.
 *
 * The parameter shadows the file-scope table of the same name; only the
 * array passed in is modified. Always returns 0.
 */
int set_indexable_char(indexable_char)
int indexable_char[256];
{
    int i;

    /* Saves a lot of calls during run-time! */
    for (i = 0; i < 256; i++) {
        /* i is never negative here, so this is the same test as !isascii(i) */
        if (i > 0x7f) indexable_char[i] = 0;
        else if (IndexNumber) indexable_char[i] = isalnum(i);
        else indexable_char[i] = isalpha(i);
    }
    indexable_char['_'] = 1;
    return 0;
}
/*
 * set_special_char(): mark the special characters interpreted by agrep
 * with 1 in a 256-entry table.
 *
 * Assumes set_indexable_char() has already been run on the table; entries
 * other than the ones listed below are left untouched.
 *
 * Only '-', '[' and ']' are marked. The other agrep metacharacters
 * (, ; . # | ( ) > < ^ $ +) are currently left as they are.
 *
 * Always returns 0.
 */
int set_special_char(special_char)
int special_char[256];
{
    special_char['-'] = 1;
    special_char['['] = 1;
    special_char[']'] = 1;
    return 0;
}